import packages

read in file

we want to subset the dataset, since here are variables and values that we don’t need to include.

Values to leave out: - Data from years 2010-2011. There a lot of missing data from those years since air pollution was not a large concern then.

Variables to leave out: - PM measurements from the Jingan and Xuhui regions (PM_Jingan, PM_Xuhui). There are less reliable than PM measurements from the US consulate.

explanatory data analysis with each variable against outcome variable, PM_US.Post

par(mfrow=c(3,4))
plot(x = year, y = PM_US.Post, main="Year VS PM2.5 Concentration", xlab="Year", ylab="PM2.5 Concentration (ug/m^3)")
boxplot(PM_US.Post~month,data=df, main="Month VS PM2.5 Concentration", xlab="Month", ylab="PM2.5 Concentration (ug/m^3)")
boxplot(PM_US.Post~day,data=df, main="Day VS PM2.5 Concentration", xlab="Day", ylab="PM2.5 Concentration (ug/m^3)")
boxplot(PM_US.Post~hour,data=df, main="Hour VS PM2.5 Concentration", xlab="Hour", ylab="PM2.5 Concentration (ug/m^3)")
boxplot(PM_US.Post~as.factor(season),data=df, main="Season VS PM2.5 Concentration", xlab="Season (Spring, Summer, Fall, Winter)", ylab="PM2.5 Concentration (ug/m^3)")
plot(x = DEWP, y = PM_US.Post, main="Dew Point VS PM2.5 Concentration", xlab="Dew Point (ºC)", ylab="PM2.5 Concentration (ug/m^3)")
plot(x = HUMI, y = PM_US.Post, main="Humidity VS PM2.5 Concentration", xlab="Humidity (%)", ylab="PM2.5 Concentration (ug/m^3)")
plot(x = PRES, y = PM_US.Post, main="Pressure VS PM2.5 Concentration", xlab="Pressure (hPa)", ylab="PM2.5 Concentration (ug/m^3)")
plot(x = TEMP, y = PM_US.Post, main="Temperature VS PM2.5 Concentration", xlab="Temperature (ºC)", ylab="PM2.5 Concentration (ug/m^3)")
boxplot(PM_US.Post~cbwd,data=df, main="Combined Wind Direction VS PM2.5 Concentration", xlab="Combined Wind Direction", ylab="PM2.5 Concentration (ug/m^3)")
plot(x = Iws, y = PM_US.Post, main="Cumulated Wind Speed VS PM2.5 Concentration", xlab="Cumulated wind speed (m/s)", ylab="PM2.5 Concentration (ug/m^3)")
plot(x = precipitation, y = PM_US.Post, main="Hourly Percipitation VS PM2.5 Concentration", xlab="Hourly Precipitation (mm)", ylab="PM2.5 Concentration (ug/m^3)")

plot(x = Iprec, y = PM_US.Post, main="Cumulated Percipitation VS PM2.5 Concentration", xlab="Cumulated precipitation (mm)", ylab="PM2.5 Concentration (ug/m^3)")

# make first basic linear regression with all continous variables, no transformation
reg1= lm(PM_US.Post ~ year + as.factor(month) + as.factor(day) + as.factor(hour) + as.factor(season) + 
    DEWP + HUMI + PRES + TEMP + as.factor(cbwd) + Iws + precipitation + Iprec, data = df)
summary(reg1)
## 
## Call:
## lm(formula = PM_US.Post ~ year + as.factor(month) + as.factor(day) + 
##     as.factor(hour) + as.factor(season) + DEWP + HUMI + PRES + 
##     TEMP + as.factor(cbwd) + Iws + precipitation + Iprec, data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -88.84 -20.45  -5.98  12.63 594.96 
## 
## Coefficients: (3 not defined because of singularities)
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         4.651e+03  3.703e+02  12.560  < 2e-16 ***
## year               -1.856e+00  1.850e-01 -10.030  < 2e-16 ***
## as.factor(month)2  -2.104e+01  9.703e-01 -21.679  < 2e-16 ***
## as.factor(month)3  -2.619e+01  9.946e-01 -26.334  < 2e-16 ***
## as.factor(month)4  -4.002e+01  1.193e+00 -33.532  < 2e-16 ***
## as.factor(month)5  -4.613e+01  1.363e+00 -33.858  < 2e-16 ***
## as.factor(month)6  -6.351e+01  1.554e+00 -40.876  < 2e-16 ***
## as.factor(month)7  -8.057e+01  1.807e+00 -44.585  < 2e-16 ***
## as.factor(month)8  -8.155e+01  1.738e+00 -46.909  < 2e-16 ***
## as.factor(month)9  -6.815e+01  1.537e+00 -44.342  < 2e-16 ***
## as.factor(month)10 -5.152e+01  1.330e+00 -38.721  < 2e-16 ***
## as.factor(month)11 -2.696e+01  1.092e+00 -24.698  < 2e-16 ***
## as.factor(month)12  2.453e-01  9.644e-01   0.254 0.799189    
## as.factor(day)2    -6.691e-01  1.589e+00  -0.421 0.673781    
## as.factor(day)3    -4.139e+00  1.568e+00  -2.639 0.008320 ** 
## as.factor(day)4     2.291e+00  1.578e+00   1.452 0.146581    
## as.factor(day)5    -6.101e-01  1.576e+00  -0.387 0.698736    
## as.factor(day)6     7.906e+00  1.577e+00   5.014 5.36e-07 ***
## as.factor(day)7     5.500e+00  1.573e+00   3.496 0.000473 ***
## as.factor(day)8     2.225e+00  1.585e+00   1.404 0.160267    
## as.factor(day)9     3.899e-01  1.576e+00   0.247 0.804586    
## as.factor(day)10    2.984e+00  1.574e+00   1.896 0.057985 .  
## as.factor(day)11    3.865e+00  1.576e+00   2.453 0.014172 *  
## as.factor(day)12   -4.410e-01  1.587e+00  -0.278 0.781109    
## as.factor(day)13   -4.378e+00  1.588e+00  -2.757 0.005833 ** 
## as.factor(day)14    1.857e+00  1.574e+00   1.180 0.238123    
## as.factor(day)15    1.251e+00  1.571e+00   0.796 0.425860    
## as.factor(day)16   -5.062e-01  1.562e+00  -0.324 0.745901    
## as.factor(day)17   -9.869e-01  1.566e+00  -0.630 0.528653    
## as.factor(day)18   -2.417e+00  1.577e+00  -1.533 0.125299    
## as.factor(day)19    3.033e+00  1.587e+00   1.911 0.056039 .  
## as.factor(day)20    4.027e+00  1.576e+00   2.556 0.010606 *  
## as.factor(day)21   -2.009e+00  1.584e+00  -1.268 0.204781    
## as.factor(day)22   -1.095e+00  1.580e+00  -0.693 0.488370    
## as.factor(day)23    2.701e+00  1.597e+00   1.691 0.090928 .  
## as.factor(day)24    1.812e+00  1.591e+00   1.139 0.254722    
## as.factor(day)25   -1.252e+00  1.575e+00  -0.795 0.426560    
## as.factor(day)26    2.061e+00  1.577e+00   1.307 0.191347    
## as.factor(day)27   -4.084e+00  1.584e+00  -2.578 0.009929 ** 
## as.factor(day)28   -1.747e+00  1.574e+00  -1.109 0.267232    
## as.factor(day)29    7.766e-02  1.602e+00   0.048 0.961329    
## as.factor(day)30    3.247e+00  1.624e+00   2.000 0.045530 *  
## as.factor(day)31   -3.892e+00  1.830e+00  -2.127 0.033449 *  
## as.factor(hour)1   -2.010e-01  1.404e+00  -0.143 0.886149    
## as.factor(hour)2   -1.247e+00  1.407e+00  -0.886 0.375555    
## as.factor(hour)3   -2.102e+00  1.412e+00  -1.489 0.136585    
## as.factor(hour)4   -2.092e+00  1.415e+00  -1.478 0.139402    
## as.factor(hour)5   -2.044e+00  1.414e+00  -1.446 0.148196    
## as.factor(hour)6    2.788e-01  1.410e+00   0.198 0.843225    
## as.factor(hour)7    8.426e-01  1.400e+00   0.602 0.547263    
## as.factor(hour)8    1.144e+00  1.393e+00   0.821 0.411755    
## as.factor(hour)9   -1.819e-01  1.400e+00  -0.130 0.896637    
## as.factor(hour)10  -1.135e+00  1.413e+00  -0.803 0.421949    
## as.factor(hour)11  -3.222e+00  1.424e+00  -2.263 0.023635 *  
## as.factor(hour)12  -3.965e+00  1.434e+00  -2.764 0.005706 ** 
## as.factor(hour)13  -4.515e+00  1.439e+00  -3.136 0.001713 ** 
## as.factor(hour)14  -4.784e+00  1.443e+00  -3.314 0.000921 ***
## as.factor(hour)15  -3.738e+00  1.441e+00  -2.594 0.009500 ** 
## as.factor(hour)16  -2.205e+00  1.431e+00  -1.541 0.123394    
## as.factor(hour)17  -8.222e-01  1.421e+00  -0.579 0.562777    
## as.factor(hour)18   1.943e+00  1.409e+00   1.379 0.167913    
## as.factor(hour)19   4.082e+00  1.404e+00   2.908 0.003636 ** 
## as.factor(hour)20   5.449e+00  1.402e+00   3.887 0.000102 ***
## as.factor(hour)21   4.072e+00  1.399e+00   2.911 0.003607 ** 
## as.factor(hour)22   3.452e+00  1.400e+00   2.466 0.013685 *  
## as.factor(hour)23   1.666e+00  1.402e+00   1.188 0.234682    
## as.factor(season)2         NA         NA      NA       NA    
## as.factor(season)3         NA         NA      NA       NA    
## as.factor(season)4         NA         NA      NA       NA    
## DEWP                3.505e+00  2.514e-01  13.942  < 2e-16 ***
## HUMI               -8.815e-01  6.479e-02 -13.606  < 2e-16 ***
## PRES               -7.206e-01  6.202e-02 -11.618  < 2e-16 ***
## TEMP               -2.658e+00  2.445e-01 -10.869  < 2e-16 ***
## as.factor(cbwd)NE  -2.243e+01  1.152e+00 -19.467  < 2e-16 ***
## as.factor(cbwd)NW   5.346e+00  1.189e+00   4.495 7.00e-06 ***
## as.factor(cbwd)SE  -2.213e+01  1.175e+00 -18.836  < 2e-16 ***
## as.factor(cbwd)SW  -4.802e+00  1.260e+00  -3.811 0.000139 ***
## Iws                -9.519e-02  2.883e-03 -33.015  < 2e-16 ***
## precipitation      -6.658e-01  2.076e-01  -3.207 0.001342 ** 
## Iprec              -3.023e-01  3.071e-02  -9.843  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 35.72 on 31727 degrees of freedom
## Multiple R-squared:  0.3092, Adjusted R-squared:  0.3076 
## F-statistic: 186.9 on 76 and 31727 DF,  p-value: < 2.2e-16
# intuitively, we can remove the day variable since day of the month shouldn't really correlate with PM levels.
# some variables would be easier to interpret if mean centered, or have its values slightly changed in other ways

# dew point, humidity, pressure: to mean-center. We can interpret weather with average dew point, humidity & pressure
DEWP.c = DEWP - mean(DEWP)
HUMI.c = HUMI - mean(HUMI)
PRES.c = PRES - mean(PRES)
TEMP.c = TEMP - mean(TEMP)

# year: -2012, setting year 2012 as the baseline
year.n = year - 2012

# hour: changing it to a binary categorical variable, rush hour (7-10, 16-19 according to Shanghai Rush Hour Highway Regulations) & non-rush hour
n = nrow(df)

rush <- c(7:10, 16:19)

rushhour = rep(0, n)
rushhour[hour %in% rush] = 1

nonrushhour = rep(1, n)
nonrushhour[hour %in% rush] = 0

# since months and seasons basically have the same trend, this caused our regression to generate NAs.

# summary: adjusted some variables to aid interpretation
# next steps: fit linear regression with the above adjusted variables, and either seasons or months variable depending on which one fits better.
# regression with adjusted values & month variable, no season variable
reg2 = lm(PM_US.Post ~ year.n + as.factor(month) +  rushhour + nonrushhour +
    DEWP.c + HUMI.c + PRES.c + TEMP.c + as.factor(cbwd) + Iws + precipitation + Iprec, data = df)
summary(reg2)
## 
## Call:
## lm(formula = PM_US.Post ~ year.n + as.factor(month) + rushhour + 
##     nonrushhour + DEWP.c + HUMI.c + PRES.c + TEMP.c + as.factor(cbwd) + 
##     Iws + precipitation + Iprec, data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -85.26 -20.61  -6.08  12.54 596.68 
## 
## Coefficients: (1 not defined because of singularities)
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        113.99900    1.46587  77.769  < 2e-16 ***
## year.n              -1.82193    0.18572  -9.810  < 2e-16 ***
## as.factor(month)2  -21.08202    0.97198 -21.690  < 2e-16 ***
## as.factor(month)3  -25.50259    0.99159 -25.719  < 2e-16 ***
## as.factor(month)4  -38.21605    1.16952 -32.677  < 2e-16 ***
## as.factor(month)5  -43.97492    1.31862 -33.349  < 2e-16 ***
## as.factor(month)6  -60.87114    1.50588 -40.422  < 2e-16 ***
## as.factor(month)7  -77.13446    1.72814 -44.634  < 2e-16 ***
## as.factor(month)8  -78.21257    1.65849 -47.159  < 2e-16 ***
## as.factor(month)9  -65.09787    1.46996 -44.285  < 2e-16 ***
## as.factor(month)10 -49.29032    1.28126 -38.470  < 2e-16 ***
## as.factor(month)11 -25.48690    1.07593 -23.688  < 2e-16 ***
## as.factor(month)12   0.53096    0.96804   0.548 0.583359    
## rushhour             1.45581    0.42960   3.389 0.000703 ***
## nonrushhour               NA         NA      NA       NA    
## DEWP.c               3.83552    0.24829  15.448  < 2e-16 ***
## HUMI.c              -0.94115    0.06365 -14.786  < 2e-16 ***
## PRES.c              -0.72969    0.06049 -12.063  < 2e-16 ***
## TEMP.c              -3.14343    0.24025 -13.084  < 2e-16 ***
## as.factor(cbwd)NE  -22.03000    1.14790 -19.192  < 2e-16 ***
## as.factor(cbwd)NW    5.02584    1.18854   4.229 2.36e-05 ***
## as.factor(cbwd)SE  -21.05242    1.17118 -17.975  < 2e-16 ***
## as.factor(cbwd)SW   -4.51206    1.26290  -3.573 0.000354 ***
## Iws                 -0.09517    0.00288 -33.050  < 2e-16 ***
## precipitation       -0.71554    0.20821  -3.437 0.000590 ***
## Iprec               -0.30198    0.03051  -9.897  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 35.89 on 31779 degrees of freedom
## Multiple R-squared:  0.3013, Adjusted R-squared:  0.3008 
## F-statistic: 571.1 on 24 and 31779 DF,  p-value: < 2.2e-16
# R^2: 0.3013

# regression with adjusted values & season variable, no month variable
reg3 = lm(PM_US.Post ~ year.n + as.factor(season) +  rushhour + nonrushhour +
    DEWP.c + HUMI.c + PRES.c + TEMP.c + as.factor(cbwd) + Iws + precipitation + Iprec, data = df)
summary(reg3)
## 
## Call:
## lm(formula = PM_US.Post ~ year.n + as.factor(season) + rushhour + 
##     nonrushhour + DEWP.c + HUMI.c + PRES.c + TEMP.c + as.factor(cbwd) + 
##     Iws + precipitation + Iprec, data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -85.98 -21.53  -6.79  12.44 584.62 
## 
## Coefficients: (1 not defined because of singularities)
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         75.958328   1.228418  61.834  < 2e-16 ***
## year.n              -1.568377   0.190209  -8.246  < 2e-16 ***
## as.factor(season)2 -19.819548   0.788907 -25.123  < 2e-16 ***
## as.factor(season)3  -6.218897   0.699650  -8.889  < 2e-16 ***
## as.factor(season)4  15.953218   0.734993  21.705  < 2e-16 ***
## rushhour             2.056458   0.441807   4.655 3.26e-06 ***
## nonrushhour                NA         NA      NA       NA    
## DEWP.c               2.901329   0.252993  11.468  < 2e-16 ***
## HUMI.c              -0.819754   0.065183 -12.576  < 2e-16 ***
## PRES.c              -0.588082   0.059037  -9.961  < 2e-16 ***
## TEMP.c              -3.489853   0.246837 -14.138  < 2e-16 ***
## as.factor(cbwd)NE  -22.972442   1.178931 -19.486  < 2e-16 ***
## as.factor(cbwd)NW    5.362890   1.221952   4.389 1.14e-05 ***
## as.factor(cbwd)SE  -20.111706   1.203243 -16.715  < 2e-16 ***
## as.factor(cbwd)SW   -1.636168   1.295060  -1.263   0.2065    
## Iws                 -0.096956   0.002948 -32.892  < 2e-16 ***
## precipitation       -0.665778   0.214350  -3.106   0.0019 ** 
## Iprec               -0.286124   0.031333  -9.132  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 36.96 on 31787 degrees of freedom
## Multiple R-squared:  0.2591, Adjusted R-squared:  0.2588 
## F-statistic: 694.8 on 16 and 31787 DF,  p-value: < 2.2e-16
# R^2: 0.2591

# the regression with months leads to a better model, as indicated by a 5% higher Rsquared value.
# intuitively, using a month variable instead of a season variable would lead to more precise predictions.
# we will get rid of the season variable and just use months as a categorical variable.
#check for multicollinearity between all continous variables
cont <- c(2, 8:11, 13:15)
contVars = df[cont]
cor(contVars)
##                        year          DEWP        HUMI        PRES
## year           1.0000000000  1.723402e-02  0.01399557  0.01825623
## DEWP           0.0172340233  1.000000e+00  0.41646008 -0.85850461
## HUMI           0.0139955717  4.164601e-01  1.00000000 -0.22420056
## PRES           0.0182562321 -8.585046e-01 -0.22420056  1.00000000
## TEMP           0.0137383992  8.841011e-01 -0.04750197 -0.83909624
## Iws           -0.0597930399 -4.870014e-05  0.03955909  0.01072049
## precipitation -0.0004504604  8.737077e-02  0.15023012 -0.09335344
## Iprec         -0.0117851657  8.081718e-02  0.16947617 -0.08746455
##                      TEMP           Iws precipitation       Iprec
## year           0.01373840 -5.979304e-02 -0.0004504604 -0.01178517
## DEWP           0.88410110 -4.870014e-05  0.0873707654  0.08081718
## HUMI          -0.04750197  3.955909e-02  0.1502301226  0.16947617
## PRES          -0.83909624  1.072049e-02 -0.0933534449 -0.08746455
## TEMP           1.00000000 -2.475607e-02  0.0289201440  0.01421182
## Iws           -0.02475607  1.000000e+00  0.0443211028  0.05878068
## precipitation  0.02892014  4.432110e-02  1.0000000000  0.43006888
## Iprec          0.01421182  5.878068e-02  0.4300688789  1.00000000
# multicollinearity (>0.8) between: pressure & dew point, pressure & temperature, dew point & temperature
# multicollinear variables: pressure, dew point, temperature
# this makes intuitive sense because the three weather factors are almost direct factors of each other
# fortunately, these variables are not messing up the standard errors, so we don't have to remove them

# summary: after looking at multi-collinearity values bewteen variables, we can consider removing: pressure, dew point. 
# next step: We will see which multicollinearity variables we can remove by manually doing some f-tests.
# we will manually use nested f-tests to see if some of the variables can be removed from our regression
# variables to test: pressure, dew point, temperature, days

# reg2 = lm(PM_US.Post ~ year.n + as.factor(month) +  rushhour + nonrushhour +
#     DEWP.c + HUMI.c + PRES.c + TEMP + as.factor(cbwd) + Iws + precipitation + Iprec, data = df)

# regression with no pressure variable
reg_noPRES = lm(PM_US.Post ~ year.n + as.factor(month) +  rushhour + nonrushhour +
    DEWP.c + HUMI.c + TEMP + as.factor(cbwd) + Iws + precipitation + Iprec, data = df)
# regression with no dew point variable
reg_noDEWP = lm(PM_US.Post ~ year.n + as.factor(month) +  rushhour + nonrushhour +
    HUMI.c + PRES.c + TEMP + as.factor(cbwd) + Iws + precipitation + Iprec, data = df)
#regression with no temperature
reg_noTEMP = lm(PM_US.Post ~ year.n + as.factor(month) +  rushhour + nonrushhour +
    DEWP.c + HUMI.c + PRES.c + as.factor(cbwd) + Iws + precipitation + Iprec, data = df)

# do ANOVA f-test on each of the regressions (VS original regression)
anova(reg2, reg_noPRES)
## Analysis of Variance Table
## 
## Model 1: PM_US.Post ~ year.n + as.factor(month) + rushhour + nonrushhour + 
##     DEWP.c + HUMI.c + PRES.c + TEMP.c + as.factor(cbwd) + Iws + 
##     precipitation + Iprec
## Model 2: PM_US.Post ~ year.n + as.factor(month) + rushhour + nonrushhour + 
##     DEWP.c + HUMI.c + TEMP + as.factor(cbwd) + Iws + precipitation + 
##     Iprec
##   Res.Df      RSS Df Sum of Sq      F    Pr(>F)    
## 1  31779 40939572                                  
## 2  31780 41127039 -1   -187467 145.52 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(reg2, reg_noDEWP)
## Analysis of Variance Table
## 
## Model 1: PM_US.Post ~ year.n + as.factor(month) + rushhour + nonrushhour + 
##     DEWP.c + HUMI.c + PRES.c + TEMP.c + as.factor(cbwd) + Iws + 
##     precipitation + Iprec
## Model 2: PM_US.Post ~ year.n + as.factor(month) + rushhour + nonrushhour + 
##     HUMI.c + PRES.c + TEMP + as.factor(cbwd) + Iws + precipitation + 
##     Iprec
##   Res.Df      RSS Df Sum of Sq      F    Pr(>F)    
## 1  31779 40939572                                  
## 2  31780 41247004 -1   -307433 238.64 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(reg2, reg_noTEMP)
## Analysis of Variance Table
## 
## Model 1: PM_US.Post ~ year.n + as.factor(month) + rushhour + nonrushhour + 
##     DEWP.c + HUMI.c + PRES.c + TEMP.c + as.factor(cbwd) + Iws + 
##     precipitation + Iprec
## Model 2: PM_US.Post ~ year.n + as.factor(month) + rushhour + nonrushhour + 
##     DEWP.c + HUMI.c + PRES.c + as.factor(cbwd) + Iws + precipitation + 
##     Iprec
##   Res.Df      RSS Df Sum of Sq     F    Pr(>F)    
## 1  31779 40939572                                 
## 2  31780 41160118 -1   -220546 171.2 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# summary: we aren't able to manually remove any of the multicollinear variables by f-tests
# next step: try automatically removing variables by backwards selection
# first backwards selection: removed baseline nonrushhour variable
reg4 <- step(reg2,direction="backward") 
## Start:  AIC=227774.9
## PM_US.Post ~ year.n + as.factor(month) + rushhour + nonrushhour + 
##     DEWP.c + HUMI.c + PRES.c + TEMP.c + as.factor(cbwd) + Iws + 
##     precipitation + Iprec
## 
## 
## Step:  AIC=227774.9
## PM_US.Post ~ year.n + as.factor(month) + rushhour + DEWP.c + 
##     HUMI.c + PRES.c + TEMP.c + as.factor(cbwd) + Iws + precipitation + 
##     Iprec
## 
##                    Df Sum of Sq      RSS    AIC
## <none>                          40939572 227775
## - rushhour          1     14794 40954366 227784
## - precipitation     1     15215 40954787 227785
## - year.n            1    123974 41063546 227869
## - Iprec             1    126185 41065756 227871
## - PRES.c            1    187467 41127039 227918
## - TEMP.c            1    220546 41160118 227944
## - HUMI.c            1    281632 41221204 227991
## - DEWP.c            1    307433 41247004 228011
## - Iws               1   1407130 42346701 228848
## - as.factor(cbwd)   4   3420722 44360294 230319
## - as.factor(month) 11   3935770 44875342 230672
summary(reg4)
## 
## Call:
## lm(formula = PM_US.Post ~ year.n + as.factor(month) + rushhour + 
##     DEWP.c + HUMI.c + PRES.c + TEMP.c + as.factor(cbwd) + Iws + 
##     precipitation + Iprec, data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -85.26 -20.61  -6.08  12.54 596.68 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        113.99900    1.46587  77.769  < 2e-16 ***
## year.n              -1.82193    0.18572  -9.810  < 2e-16 ***
## as.factor(month)2  -21.08202    0.97198 -21.690  < 2e-16 ***
## as.factor(month)3  -25.50259    0.99159 -25.719  < 2e-16 ***
## as.factor(month)4  -38.21605    1.16952 -32.677  < 2e-16 ***
## as.factor(month)5  -43.97492    1.31862 -33.349  < 2e-16 ***
## as.factor(month)6  -60.87114    1.50588 -40.422  < 2e-16 ***
## as.factor(month)7  -77.13446    1.72814 -44.634  < 2e-16 ***
## as.factor(month)8  -78.21257    1.65849 -47.159  < 2e-16 ***
## as.factor(month)9  -65.09787    1.46996 -44.285  < 2e-16 ***
## as.factor(month)10 -49.29032    1.28126 -38.470  < 2e-16 ***
## as.factor(month)11 -25.48690    1.07593 -23.688  < 2e-16 ***
## as.factor(month)12   0.53096    0.96804   0.548 0.583359    
## rushhour             1.45581    0.42960   3.389 0.000703 ***
## DEWP.c               3.83552    0.24829  15.448  < 2e-16 ***
## HUMI.c              -0.94115    0.06365 -14.786  < 2e-16 ***
## PRES.c              -0.72969    0.06049 -12.063  < 2e-16 ***
## TEMP.c              -3.14343    0.24025 -13.084  < 2e-16 ***
## as.factor(cbwd)NE  -22.03000    1.14790 -19.192  < 2e-16 ***
## as.factor(cbwd)NW    5.02584    1.18854   4.229 2.36e-05 ***
## as.factor(cbwd)SE  -21.05242    1.17118 -17.975  < 2e-16 ***
## as.factor(cbwd)SW   -4.51206    1.26290  -3.573 0.000354 ***
## Iws                 -0.09517    0.00288 -33.050  < 2e-16 ***
## precipitation       -0.71554    0.20821  -3.437 0.000590 ***
## Iprec               -0.30198    0.03051  -9.897  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 35.89 on 31779 degrees of freedom
## Multiple R-squared:  0.3013, Adjusted R-squared:  0.3008 
## F-statistic: 571.1 on 24 and 31779 DF,  p-value: < 2.2e-16
# since no real variables have been removed, the two models are the same. the anova p-value is 0, indicates no change
anova(reg2, reg4)
## Analysis of Variance Table
## 
## Model 1: PM_US.Post ~ year.n + as.factor(month) + rushhour + nonrushhour + 
##     DEWP.c + HUMI.c + PRES.c + TEMP.c + as.factor(cbwd) + Iws + 
##     precipitation + Iprec
## Model 2: PM_US.Post ~ year.n + as.factor(month) + rushhour + DEWP.c + 
##     HUMI.c + PRES.c + TEMP.c + as.factor(cbwd) + Iws + precipitation + 
##     Iprec
##   Res.Df      RSS Df Sum of Sq F Pr(>F)
## 1  31779 40939572                      
## 2  31779 40939572  0         0
# no further variables removed (other than clarifying baseline for nonrushhour variable), nothing smaller than the given AIC
# as shown in the exploratory data analysis, we should do some transformations as many linear regression assumptions are violated
# have to add 0.1 to transformed x-variables since we cannot log (0)
# x-variables to transform (non-linearity): log(Iws+0.1), log(precipitation+0.1), log(Iprec+0.1)
Iws.log = log(Iws+0.1)
precipitation.log = log(precipitation+0.1)
Iprec.log = log(Iprec+0.1)


# x-variables to square (non-linearity)
DEWP.c.2 = DEWP.c^2
HUMI.c.2 = HUMI.c^2
PRES.c.2 = PRES.c^2
TEMP.c.2 = TEMP.c^2

# y-varable to transform (non-constant variance): log(PM_US.Post)
PM_US.Post.log = log(PM_US.Post)

# try with these transformations
boxplot(log(PM_US.Post)~month,data=df, main="Month VS log PM2.5 Concentration", xlab="Month", ylab="log PM2.5 Concentration (ug/m^3)")

boxplot(log(PM_US.Post)~rushhour,data=df, main="Rush Hour VS log PM2.5 Concentration", xlab="Rush Hour", ylab="log PM2.5 Concentration (ug/m^3)")

plot(x = DEWP.c.2 + DEWP.c, y = log(PM_US.Post), main="Dew Point Centered VS log PM2.5 Concentration", xlab="Dew Point Centered (ºC)", ylab="log PM2.5 Concentration (ug/m^3)")

plot(x = HUMI.c.2 + HUMI.c, y = log(PM_US.Post), main="Humidity Centered VS PM2.5 Concentration", xlab="Humidity Centered (%)", ylab="log PM2.5 Concentration (ug/m^3)")

plot(x = PRES.c.2 + PRES.c, y = log(PM_US.Post), main="Pressure Centered VS PM2.5 Concentration", xlab="Pressure Centered (hPa)", ylab="log PM2.5 Concentration (ug/m^3)")

plot(x = TEMP.c.2 + TEMP.c, y = log(PM_US.Post), main="Temperature VS log PM2.5 Concentration", xlab="Temperature (ºC)", ylab="log PM2.5 Concentration (ug/m^3)")

boxplot(log(PM_US.Post)~cbwd,data=df, main="Combined Wind Direction VS log PM2.5 Concentration", xlab="Combined Wind Direction", ylab="log PM2.5 Concentration (ug/m^3)")

plot(x = log(Iws+0.1), y = log(PM_US.Post), main="log Cumulated Wind Speed VS log PM2.5 Concentration", xlab="log Cumulated wind speed (m/s)", ylab="log PM2.5 Concentration (ug/m^3)")

plot(x = log(precipitation+0.1), y = log(PM_US.Post), main="log Hourly Percipitation VS log PM2.5 Concentration", xlab="log Hourly Precipitation (mm)", ylab="log PM2.5 Concentration (ug/m^3)")

plot(x = log(Iprec+0.1), y = log(PM_US.Post), main="log Cumulated Percipitation VS log PM2.5 Concentration", xlab="log Cumulated precipitation (mm)", ylab="log PM2.5 Concentration (ug/m^3)")

# these transformations are not ideal but they're fine

# overall, they fix the non-linear and non-constant variance violations we were concerned about
# regression with just log y
reg5 = lm(PM_US.Post.log ~ year + as.factor(month) + rushhour + DEWP.c + 
    HUMI.c + PRES.c + TEMP.c + as.factor(cbwd) + Iws + precipitation + 
    Iprec, data = df)
summary(reg5)
## 
## Call:
## lm(formula = PM_US.Post.log ~ year + as.factor(month) + rushhour + 
##     DEWP.c + HUMI.c + PRES.c + TEMP.c + as.factor(cbwd) + Iws + 
##     precipitation + Iprec, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.0838 -0.3750  0.0156  0.4024  2.7291 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         1.489e+01  6.495e+00   2.293   0.0218 *  
## year               -5.075e-03  3.226e-03  -1.573   0.1156    
## as.factor(month)2  -2.781e-01  1.688e-02 -16.477  < 2e-16 ***
## as.factor(month)3  -3.029e-01  1.722e-02 -17.587  < 2e-16 ***
## as.factor(month)4  -4.497e-01  2.031e-02 -22.141  < 2e-16 ***
## as.factor(month)5  -5.700e-01  2.290e-02 -24.888  < 2e-16 ***
## as.factor(month)6  -9.656e-01  2.615e-02 -36.922  < 2e-16 ***
## as.factor(month)7  -1.462e+00  3.001e-02 -48.718  < 2e-16 ***
## as.factor(month)8  -1.534e+00  2.880e-02 -53.269  < 2e-16 ***
## as.factor(month)9  -1.163e+00  2.553e-02 -45.572  < 2e-16 ***
## as.factor(month)10 -7.567e-01  2.225e-02 -34.004  < 2e-16 ***
## as.factor(month)11 -3.149e-01  1.869e-02 -16.852  < 2e-16 ***
## as.factor(month)12 -4.991e-03  1.681e-02  -0.297   0.7666    
## rushhour            5.029e-02  7.461e-03   6.740 1.61e-11 ***
## DEWP.c              6.233e-02  4.312e-03  14.454  < 2e-16 ***
## HUMI.c             -1.757e-02  1.105e-03 -15.896  < 2e-16 ***
## PRES.c             -1.316e-02  1.051e-03 -12.529  < 2e-16 ***
## TEMP.c             -5.363e-02  4.172e-03 -12.853  < 2e-16 ***
## as.factor(cbwd)NE  -3.782e-01  1.994e-02 -18.970  < 2e-16 ***
## as.factor(cbwd)NW   9.796e-02  2.064e-02   4.746 2.09e-06 ***
## as.factor(cbwd)SE  -3.763e-01  2.034e-02 -18.500  < 2e-16 ***
## as.factor(cbwd)SW  -2.096e-02  2.193e-02  -0.956   0.3392    
## Iws                -2.435e-03  5.001e-05 -48.695  < 2e-16 ***
## precipitation      -1.430e-02  3.616e-03  -3.955 7.66e-05 ***
## Iprec              -1.040e-02  5.299e-04 -19.618  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6234 on 31779 degrees of freedom
## Multiple R-squared:  0.4013, Adjusted R-squared:  0.4008 
## F-statistic: 887.5 on 24 and 31779 DF,  p-value: < 2.2e-16
# R^2: 0.4012

# regression with log y & x^2
reg6 = lm(PM_US.Post.log ~ year + as.factor(month) + rushhour + DEWP.c + DEWP.c.2 +
    HUMI.c + HUMI.c.2 + PRES.c + PRES.c.2 + TEMP.c + TEMP.c.2 + as.factor(cbwd) + Iws + precipitation + 
    Iprec, data = df)
summary(reg6)
## 
## Call:
## lm(formula = PM_US.Post.log ~ year + as.factor(month) + rushhour + 
##     DEWP.c + DEWP.c.2 + HUMI.c + HUMI.c.2 + PRES.c + PRES.c.2 + 
##     TEMP.c + TEMP.c.2 + as.factor(cbwd) + Iws + precipitation + 
##     Iprec, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.0263 -0.3706  0.0167  0.4030  2.6933 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         6.202e+00  6.528e+00   0.950  0.34211    
## year               -7.102e-04  3.242e-03  -0.219  0.82661    
## as.factor(month)2  -2.994e-01  1.685e-02 -17.769  < 2e-16 ***
## as.factor(month)3  -3.164e-01  1.750e-02 -18.083  < 2e-16 ***
## as.factor(month)4  -4.504e-01  2.071e-02 -21.747  < 2e-16 ***
## as.factor(month)5  -5.287e-01  2.303e-02 -22.960  < 2e-16 ***
## as.factor(month)6  -8.330e-01  2.711e-02 -30.731  < 2e-16 ***
## as.factor(month)7  -1.302e+00  3.259e-02 -39.963  < 2e-16 ***
## as.factor(month)8  -1.376e+00  3.147e-02 -43.715  < 2e-16 ***
## as.factor(month)9  -1.086e+00  2.655e-02 -40.895  < 2e-16 ***
## as.factor(month)10 -7.340e-01  2.278e-02 -32.227  < 2e-16 ***
## as.factor(month)11 -2.989e-01  1.933e-02 -15.464  < 2e-16 ***
## as.factor(month)12  1.764e-02  1.681e-02   1.049  0.29404    
## rushhour            5.288e-02  7.415e-03   7.132 1.01e-12 ***
## DEWP.c             -5.596e-02  1.364e-02  -4.103 4.09e-05 ***
## DEWP.c.2           -3.673e-04  7.527e-05  -4.880 1.06e-06 ***
## HUMI.c              7.336e-03  3.102e-03   2.365  0.01804 *  
## HUMI.c.2           -2.998e-04  3.428e-05  -8.745  < 2e-16 ***
## PRES.c             -1.433e-02  1.055e-03 -13.579  < 2e-16 ***
## PRES.c.2           -4.121e-04  5.763e-05  -7.150 8.89e-13 ***
## TEMP.c              5.143e-02  1.298e-02   3.961 7.47e-05 ***
## TEMP.c.2            1.584e-04  7.577e-05   2.090  0.03658 *  
## as.factor(cbwd)NE  -3.811e-01  1.987e-02 -19.175  < 2e-16 ***
## as.factor(cbwd)NW   1.111e-01  2.053e-02   5.412 6.28e-08 ***
## as.factor(cbwd)SE  -3.785e-01  2.028e-02 -18.660  < 2e-16 ***
## as.factor(cbwd)SW  -2.230e-02  2.182e-02  -1.022  0.30676    
## Iws                -2.381e-03  4.999e-05 -47.624  < 2e-16 ***
## precipitation      -9.495e-03  3.603e-03  -2.635  0.00841 ** 
## Iprec              -9.259e-03  5.311e-04 -17.433  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6194 on 31775 degrees of freedom
## Multiple R-squared:  0.409,  Adjusted R-squared:  0.4085 
## F-statistic: 785.4 on 28 and 31775 DF,  p-value: < 2.2e-16
# R^2: 0.409

# regression with log y & log x
reg7 = lm(PM_US.Post.log ~ year + as.factor(month) + rushhour + DEWP.c + 
    HUMI.c + PRES.c + TEMP.c + as.factor(cbwd) + Iws.log + precipitation.log + 
    Iprec.log, data = df)
summary(reg7)
## 
## Call:
## lm(formula = PM_US.Post.log ~ year + as.factor(month) + rushhour + 
##     DEWP.c + HUMI.c + PRES.c + TEMP.c + as.factor(cbwd) + Iws.log + 
##     precipitation.log + Iprec.log, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.9620 -0.3731  0.0165  0.4060  2.8044 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         8.896950   6.498430   1.369  0.17098    
## year               -0.002233   0.003227  -0.692  0.48909    
## as.factor(month)2  -0.259151   0.016899 -15.335  < 2e-16 ***
## as.factor(month)3  -0.308689   0.017242 -17.903  < 2e-16 ***
## as.factor(month)4  -0.462195   0.020340 -22.723  < 2e-16 ***
## as.factor(month)5  -0.599272   0.022933 -26.131  < 2e-16 ***
## as.factor(month)6  -1.005824   0.026209 -38.377  < 2e-16 ***
## as.factor(month)7  -1.556281   0.030017 -51.847  < 2e-16 ***
## as.factor(month)8  -1.607217   0.028849 -55.712  < 2e-16 ***
## as.factor(month)9  -1.217443   0.025567 -47.618  < 2e-16 ***
## as.factor(month)10 -0.803986   0.022265 -36.110  < 2e-16 ***
## as.factor(month)11 -0.337652   0.018708 -18.048  < 2e-16 ***
## as.factor(month)12 -0.013597   0.016832  -0.808  0.41922    
## rushhour            0.054326   0.007474   7.269 3.72e-13 ***
## DEWP.c              0.044304   0.004433   9.994  < 2e-16 ***
## HUMI.c             -0.011708   0.001150 -10.180  < 2e-16 ***
## PRES.c             -0.012585   0.001052 -11.961  < 2e-16 ***
## TEMP.c             -0.033409   0.004275  -7.814 5.69e-15 ***
## as.factor(cbwd)NE  -0.046229   0.022496  -2.055  0.03989 *  
## as.factor(cbwd)NW   0.427626   0.022902  18.672  < 2e-16 ***
## as.factor(cbwd)SE  -0.058612   0.022652  -2.587  0.00967 ** 
## as.factor(cbwd)SW   0.272054   0.023293  11.680  < 2e-16 ***
## Iws.log            -0.108802   0.002580 -42.170  < 2e-16 ***
## precipitation.log   0.030167   0.011755   2.566  0.01028 *  
## Iprec.log          -0.123121   0.007241 -17.003  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6241 on 31779 degrees of freedom
## Multiple R-squared:  0.3998, Adjusted R-squared:  0.3994 
## F-statistic: 882.1 on 24 and 31779 DF,  p-value: < 2.2e-16
# R^2: 0.3998

# regression with log y, log x and x^2
reg8 = lm(PM_US.Post.log ~ year + as.factor(month) + rushhour + DEWP.c + DEWP.c.2 +
    HUMI.c + HUMI.c.2 + PRES.c + PRES.c.2 + TEMP.c + TEMP.c.2 + as.factor(cbwd) + Iws.log + precipitation.log + 
    Iprec.log, data = df)
summary(reg8)
## 
## Call:
## lm(formula = PM_US.Post.log ~ year + as.factor(month) + rushhour + 
##     DEWP.c + DEWP.c.2 + HUMI.c + HUMI.c.2 + PRES.c + PRES.c.2 + 
##     TEMP.c + TEMP.c.2 + as.factor(cbwd) + Iws.log + precipitation.log + 
##     Iprec.log, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.8897 -0.3703  0.0161  0.4015  2.7521 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         2.078e+00  6.525e+00   0.318 0.750171    
## year                1.203e-03  3.241e-03   0.371 0.710571    
## as.factor(month)2  -2.878e-01  1.685e-02 -17.077  < 2e-16 ***
## as.factor(month)3  -3.332e-01  1.749e-02 -19.049  < 2e-16 ***
## as.factor(month)4  -4.710e-01  2.072e-02 -22.736  < 2e-16 ***
## as.factor(month)5  -5.535e-01  2.303e-02 -24.035  < 2e-16 ***
## as.factor(month)6  -8.429e-01  2.712e-02 -31.081  < 2e-16 ***
## as.factor(month)7  -1.343e+00  3.259e-02 -41.228  < 2e-16 ***
## as.factor(month)8  -1.398e+00  3.149e-02 -44.396  < 2e-16 ***
## as.factor(month)9  -1.124e+00  2.656e-02 -42.303  < 2e-16 ***
## as.factor(month)10 -7.844e-01  2.276e-02 -34.470  < 2e-16 ***
## as.factor(month)11 -3.331e-01  1.932e-02 -17.239  < 2e-16 ***
## as.factor(month)12  5.897e-03  1.681e-02   0.351 0.725658    
## rushhour            5.696e-02  7.419e-03   7.677 1.67e-14 ***
## DEWP.c             -2.081e-02  1.376e-02  -1.512 0.130520    
## DEWP.c.2           -6.978e-04  7.593e-05  -9.189  < 2e-16 ***
## HUMI.c              7.662e-04  3.118e-03   0.246 0.805902    
## HUMI.c.2           -1.588e-04  3.511e-05  -4.523 6.12e-06 ***
## PRES.c             -1.348e-02  1.055e-03 -12.778  < 2e-16 ***
## PRES.c.2           -5.186e-04  5.749e-05  -9.020  < 2e-16 ***
## TEMP.c              1.930e-02  1.310e-02   1.473 0.140766    
## TEMP.c.2            3.276e-04  7.641e-05   4.287 1.82e-05 ***
## as.factor(cbwd)NE  -5.371e-02  2.239e-02  -2.398 0.016478 *  
## as.factor(cbwd)NW   4.382e-01  2.275e-02  19.261  < 2e-16 ***
## as.factor(cbwd)SE  -6.344e-02  2.255e-02  -2.813 0.004913 ** 
## as.factor(cbwd)SW   2.692e-01  2.314e-02  11.634  < 2e-16 ***
## Iws.log            -1.074e-01  2.568e-03 -41.835  < 2e-16 ***
## precipitation.log   4.046e-02  1.168e-02   3.464 0.000533 ***
## Iprec.log          -1.212e-01  7.257e-03 -16.696  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6194 on 31775 degrees of freedom
## Multiple R-squared:  0.409,  Adjusted R-squared:  0.4084 
## F-statistic: 785.2 on 28 and 31775 DF,  p-value: < 2.2e-16
# R^2: 0.409

# the higher R-squared value from this regression implies that these transformed models are better fits than the previous model
# within the transformed regressions above, there's not much difference in the R^2 values. for interpretation purposes, we will want to work with regression 5 (log y, no x transformations) which will make explaining co-efficients easier. But we need to check with residual plots.

# sumamry: fitted regression with tranformed variables: logy, x^2 (some), logx (some)
# next step: check for interaction variables
# we want to do residual plots for each of the regressions above (reg 5,6,7,8)

# regression 5 (log y, no x transformations)
# variables that didn't need transformations
plot(y = reg5$residual, x=year,  main = "Reg 5, Year VS Residuals", ylab = "Residuals")

boxplot(reg5$residual ~ as.factor(month), main = "Reg 5, Plot of Residuals", ylab = "Residuals")

boxplot(reg5$residual ~ rushhour, main = "Reg 5, Rushhour of Residuals", ylab = "Residuals")

boxplot(reg5$residual ~ as.factor(cbwd), main = "Reg 5, Combined Wind Direction of Residuals", ylab = "Residuals")

# variables possibly to be squared
plot(y = reg5$residual, x=DEWP.c,  main = "Dew Point VS Residuals", ylab = "Residuals")

plot(y = reg5$residual, x=HUMI.c,  main = "Humidity VS Residuals", ylab = "Residuals")

plot(y = reg5$residual, x=PRES.c,  main = "Pressure VS Residuals", ylab = "Residuals")

plot(y = reg5$residual, x=TEMP.c,  main = "Temperature VS Residuals", ylab = "Residuals")

# variables possiblly to be logged
plot(y = reg5$residual, x=Iws,  main = "Cumulative Wind Speed VS Residuals", ylab = "Residuals")

plot(y = reg5$residual, x=precipitation,  main = "Hourly Precipitation VS Residuals", ylab = "Residuals")

plot(y = reg5$residual, x=Iprec,  main = "Cumulative Precipitation VS Residuals", ylab = "Residuals")

# these don't look great, definately need some sort of transformation

# x^2 variables 
plot(y = reg6$residual, x=DEWP.c + DEWP.c.2,  main = "Dew Point^2 VS Residuals", ylab = "Residuals")

plot(y = reg6$residual, x=HUMI.c + HUMI.c.2,  main = "Humidity^2 VS Residuals", ylab = "Residuals")

plot(y = reg6$residual, x=PRES.c + PRES.c.2,  main = "Pressure^2 VS Residuals", ylab = "Residuals")

plot(y = reg6$residual, x=TEMP.c + TEMP.c.2,  main = "Temperature^2 VS Residuals", ylab = "Residuals")

# log x variables
plot(y = reg7$residual, x=Iws.log,  main = "log Cumulative Wind Speed VS Residuals", ylab = "Residuals")

plot(y = reg7$residual, x=precipitation.log,  main = "log Hourly Precipitation VS Residuals", ylab = "Residuals")

plot(y = reg7$residual, x=Iprec.log,  main = "log Cumulative Precipitation VS Residuals", ylab = "Residuals")

# we can use xyplots to eye-ball possible interactions, testing variables as conditionals of various categorical variables 

# variables to test: year, month, rushhour, humidity, temperature (dew point & pressure highly correlated), combined wind direction
# all other variables are continous with too many different levels

# lets see if any variables interact with year
xyplot(PM_US.Post.log ~ as.factor(month) | year)

xyplot(PM_US.Post.log ~ rushhour | year)

xyplot(PM_US.Post.log ~ DEWP.c | year)

xyplot(PM_US.Post.log ~ HUMI.c | year)

xyplot(PM_US.Post.log ~ PRES.c | year)

xyplot(PM_US.Post.log ~ TEMP | year)

xyplot(PM_US.Post.log ~ as.factor(cbwd) | year)

xyplot(PM_US.Post.log ~ Iws.log | year)

xyplot(PM_US.Post.log ~ precipitation.log | year)

xyplot(PM_US.Post.log ~ Iprec.log | year)

#no interactions with year

# lets see if any variables interact with month
xyplot(PM_US.Post.log ~ year | as.factor(month))

xyplot(PM_US.Post.log ~ rushhour | as.factor(month))

xyplot(PM_US.Post.log ~ DEWP.c | as.factor(month))

xyplot(PM_US.Post.log ~ HUMI.c | as.factor(month))

xyplot(PM_US.Post.log ~ PRES.c | as.factor(month))

xyplot(PM_US.Post.log ~ TEMP | as.factor(month))

xyplot(PM_US.Post.log ~ as.factor(cbwd) | as.factor(month))

xyplot(PM_US.Post.log ~ Iws.log | as.factor(month))

xyplot(PM_US.Post.log ~ precipitation.log | as.factor(month))

xyplot(PM_US.Post.log ~ Iprec.log | as.factor(month))

# 

# lets see if any variables interact with rushhour
xyplot(PM_US.Post.log ~ year | rushhour)

xyplot(PM_US.Post.log ~ as.factor(month) | rushhour)

xyplot(PM_US.Post.log ~ DEWP.c | rushhour)

xyplot(PM_US.Post.log ~ HUMI.c | rushhour)

xyplot(PM_US.Post.log ~ PRES.c | rushhour)

xyplot(PM_US.Post.log ~ TEMP | rushhour)

xyplot(PM_US.Post.log ~ as.factor(cbwd) | rushhour)

xyplot(PM_US.Post.log ~ Iws.log | rushhour)

xyplot(PM_US.Post.log ~ precipitation.log | rushhour)

xyplot(PM_US.Post.log ~ Iprec.log | rushhour)

# no variables seems to interact with rushhour

# lets see if any variables interact with humidity
# split humidity into categorical variable with 6 levels
xyplot(PM_US.Post.log ~ year | cut(HUMI.c, 6))

xyplot(PM_US.Post.log ~ as.factor(month) | cut(HUMI.c, 6))

xyplot(PM_US.Post.log ~ rushhour | cut(HUMI.c, 6))

xyplot(PM_US.Post.log ~ DEWP.c | cut(HUMI.c, 6))

xyplot(PM_US.Post.log ~ PRES.c | cut(HUMI.c, 6))

xyplot(PM_US.Post.log ~ TEMP | cut(HUMI.c, 6))

xyplot(PM_US.Post.log ~ as.factor(cbwd) | cut(HUMI.c, 6))

xyplot(PM_US.Post.log ~ Iws.log | cut(HUMI.c, 6))

xyplot(PM_US.Post.log ~ precipitation.log | cut(HUMI.c, 6))

xyplot(PM_US.Post.log ~ Iprec.log | cut(HUMI.c, 6))

#???????

# lets see if any variables interact with temperature
# split temperature  into categorical variable with 6 levels
xyplot(PM_US.Post.log ~ year | cut(TEMP, 6))

xyplot(PM_US.Post.log ~ as.factor(month) | cut(TEMP, 6))

xyplot(PM_US.Post.log ~ rushhour | cut(TEMP, 6))

xyplot(PM_US.Post.log ~ DEWP.c | cut(TEMP, 6))

xyplot(PM_US.Post.log ~ HUMI.c | cut(TEMP, 6))

xyplot(PM_US.Post.log ~ PRES.c | cut(TEMP, 6))

xyplot(PM_US.Post.log ~ as.factor(cbwd) | cut(TEMP, 6))

xyplot(PM_US.Post.log ~ Iws.log | cut(TEMP, 6))

xyplot(PM_US.Post.log ~ precipitation.log | cut(TEMP, 6))

xyplot(PM_US.Post.log ~ Iprec.log | cut(TEMP, 6))

#???????
# temperature has high multicollinearity with pressure & dew point
# we can just use the temperature variable to account for pressure & dew point

# lets see if any variables interact with combined wind direction
xyplot(PM_US.Post.log ~ year | as.factor(cbwd))

xyplot(PM_US.Post.log ~ as.factor(month) | as.factor(cbwd))

xyplot(PM_US.Post.log ~ rushhour | as.factor(cbwd))

xyplot(PM_US.Post.log ~ DEWP.c | as.factor(cbwd))

xyplot(PM_US.Post.log ~ HUMI.c | as.factor(cbwd))

xyplot(PM_US.Post.log ~ PRES.c | as.factor(cbwd))

xyplot(PM_US.Post.log ~ TEMP | as.factor(cbwd))

xyplot(PM_US.Post.log ~ Iws.log | as.factor(cbwd))

xyplot(PM_US.Post.log ~ precipitation.log | as.factor(cbwd))

xyplot(PM_US.Post.log ~ Iprec.log | as.factor(cbwd))

#Iws.log + precipitation.log + Iprec.log

# lets see if any variables interact with log cumulated wind speed
xyplot(PM_US.Post.log ~ year | cut(Iws.log, 6))

xyplot(PM_US.Post.log ~ as.factor(month) | cut(Iws.log, 6))

xyplot(PM_US.Post.log ~ rushhour | cut(Iws.log, 6))

xyplot(PM_US.Post.log ~ DEWP.c | cut(Iws.log, 6))

xyplot(PM_US.Post.log ~ HUMI.c | cut(Iws.log, 6))

xyplot(PM_US.Post.log ~ PRES.c | cut(Iws.log, 6))

xyplot(PM_US.Post.log ~ TEMP | cut(Iws.log, 6))

xyplot(PM_US.Post.log ~ as.factor(cbwd) | cut(Iws.log, 6))

xyplot(PM_US.Post.log ~ precipitation.log | cut(Iws.log, 6))

xyplot(PM_US.Post.log ~ Iprec.log | cut(Iws.log, 6))

# no variables seem to interact with log cumulated wind speed

# lets see if any variables interact with log hourly precipitation
xyplot(PM_US.Post.log ~ year | cut(precipitation.log, 6))

xyplot(PM_US.Post.log ~ as.factor(month) | cut(precipitation.log, 6))

xyplot(PM_US.Post.log ~ rushhour | cut(precipitation.log, 6))

xyplot(PM_US.Post.log ~ DEWP.c | cut(precipitation.log, 6))

xyplot(PM_US.Post.log ~ HUMI.c | cut(precipitation.log, 6))

xyplot(PM_US.Post.log ~ PRES.c | cut(precipitation.log, 6))

xyplot(PM_US.Post.log ~ TEMP | cut(precipitation.log, 6))

xyplot(PM_US.Post.log ~ as.factor(cbwd) | cut(precipitation.log, 6))

xyplot(PM_US.Post.log ~ Iws.log | cut(precipitation.log, 6))

xyplot(PM_US.Post.log ~ precipitation.log | cut(precipitation.log, 6))

xyplot(PM_US.Post.log ~ Iprec.log | cut(precipitation.log, 6))

# no variables seem to interact with log hourly precipitation

# lets see if any variables interact with log cumulated precipitation
xyplot(PM_US.Post.log ~ year | cut(Iprec.log, 6))

xyplot(PM_US.Post.log ~ as.factor(month) | cut(Iprec.log, 6))

xyplot(PM_US.Post.log ~ rushhour | cut(Iprec.log, 6))

xyplot(PM_US.Post.log ~ DEWP.c | cut(Iprec.log, 6))

xyplot(PM_US.Post.log ~ HUMI.c | cut(Iprec.log, 6))

xyplot(PM_US.Post.log ~ PRES.c | cut(Iprec.log, 6))

xyplot(PM_US.Post.log ~ TEMP | cut(Iprec.log, 6))

xyplot(PM_US.Post.log ~ as.factor(cbwd) | cut(Iprec.log, 6))

xyplot(PM_US.Post.log ~ Iws.log | cut(Iprec.log, 6))

xyplot(PM_US.Post.log ~ precipitation.log | cut(Iprec.log, 6))

# no variables seem to interact with log cumulated precipitation

# seems to be interaction between combined wind direction and log cumulated wind speed, make a var for that

# summary: after eye-balling for possible interaction effects amongst almost all of the variables, there doesn't seem to be any interaction effects. No need to make interaction variables.
# next step: decide on final regression and interpret coefficients